# -*- coding: UTF-8 -*-
import scrapy
import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )

import MySQLdb
import re
import datetime

# Today's date, captured once at import time; used only in log output below.
nowDate = datetime.datetime.now().strftime('%Y-%m-%d')

# Module-level MySQL connection shared by the spider class below.
# NOTE(review): credentials are hard-coded and the connection is opened at
# import time — consider moving these into Scrapy settings / a pipeline.
conn= MySQLdb.connect(
        host='127.0.0.1',
        port = 3306,
        user='root',
        passwd='root',
        db ='test',
        charset='utf8'
        )

# Shared cursor used both for the start-URL SELECT and per-page UPDATEs.
cur = conn.cursor()

        
class DmozContinueSpider(scrapy.Spider):
    """Fill in the detail columns of `newUrl` rows whose `detail` is NULL.

    Pending URLs are read from the module-level MySQL connection, each job
    page is crawled, and the scraped fields are written back with an UPDATE
    keyed on the page URL.
    """
    name = "dmozContinue"
    allowed_domains = ["dmoz.org"]

    # Pre-compiled pattern that strips HTML tags from the detail markup.
    _TAG_RE = re.compile(r'<[^>]+>', re.S)

    def start_requests(self):
        """Yield one request per row still missing a `detail` value.

        Running the SELECT here (instead of at class-definition time, as the
        original did) avoids a database query as an import side effect.
        Column 1 of each row is assumed to hold the URL — TODO confirm schema.
        """
        cur.execute('select * from newUrl where detail is null')
        for row in cur.fetchall():
            yield scrapy.Request(row[1], callback=self.parse)

    def parse(self, response):
        """Extract job fields from a detail page and UPDATE the matching row."""
        for sel in response.xpath('//div[@class="content_wrap"]'):
            title = sel.xpath('div[@class="job_header"]/h1/text()').extract()[0]
            company = sel.xpath('div[@class="job_header"]/p/text()').extract()[0]

            # Extract the four "con" spans once (the original re-ran this
            # xpath five times) and pad with '' so missing trailing fields
            # behave exactly as before.
            cons = sel.xpath(
                '//ul[@class="job_list"]/li/span[@class="con"]/text()'
            ).extract()
            cons += [''] * (4 - len(cons))
            amount, timeline, address, time = cons[0], cons[1], cons[2], cons[3]

            # Strip HTML tags from the raw detail markup.
            detail = sel.xpath('//div[@id="job_detail"]').extract()[0]
            dd = self._TAG_RE.sub('', detail)

            # Parameterized query: the driver quotes/escapes every value,
            # closing the SQL-injection hole of the old string-built UPDATE
            # (which only hand-escaped double quotes in `detail`).
            sql = ('update newUrl set title = %s, company = %s, amount = %s, '
                   'timeline = %s, address = %s, time = %s, detail = %s '
                   'where url = %s')
            try:
                cur.execute(sql, (title, company, amount, timeline, address,
                                  time, dd, response.url))
                conn.commit()
                print(title + '-' + str(nowDate))
            except MySQLdb.Error:
                # Roll back the failed UPDATE but keep crawling; the old bare
                # `except:` also swallowed KeyboardInterrupt/SystemExit.
                conn.rollback()
                print(title + '-' + str(nowDate) + 'error')
            
            
           
        
#cur.close()
#conn.commit()
#conn.close()